import pandas as pd
import altair as alt
import geopandas as gpd
import datetime
import ast
import json
import plotly.express as px
import plotly.graph_objects as go
Датасет містить дані про всі матчі футбольного турніру Ліги Чемпіонів 2020/2021. Для кожного матчу навні статистичні дані та список подій(голи, жовті/червоні картки, заміни)
df = pd.read_csv('data/champions_league_2020-2021_results.csv')
df.head()
| stage | date | pens | pens_home_score | pens_away_score | team_name_home | team_name_away | team_home_score | team_away_score | possession_home | ... | shots_on_target_away | duels_won_home | duels_won_away | prediction_team_home_win | prediction_draw | prediction_team_away_win | location | events_list | lineup_home | lineup_away | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Final | 29.05.2021 | False | False | False | Manchester City | Chelsea | 0 | 1 | 61% | ... | 2 | 50% | 50% | 55% | 3% | 42% | Estádio Do Dragão | [{'event_team': 'home', 'event_time': " 34' ",... | [{'player_name': ' Sterling ', 'player_number'... | [{'player_name': ' Werner ', 'player_number': ... |
| 1 | Semi-finals second leg | 05.05.2021 | False | False | False | Chelsea | Real Madrid | 2 | 0 | 32% | ... | 5 | 52% | 49% | 43% | 3% | 54% | Stamford Bridge | [{'event_team': 'home', 'event_time': " 14' ",... | [{'player_name': ' Werner ', 'player_number': ... | [{'player_name': ' Hazard ', 'player_number': ... |
| 2 | Semi-finals second leg | 04.05.2021 | False | False | False | Manchester City | PSG | 2 | 0 | 44% | ... | 0 | 53% | 47% | 44% | 3% | 53% | Etihad Stadium | [{'event_team': 'home', 'event_time': " 11' ",... | [{'player_name': ' Silva ', 'player_number': '... | [{'player_name': ' Neymar ', 'player_number': ... |
| 3 | Semi-finals first leg | 28.04.2021 | False | False | False | PSG | Manchester City | 1 | 2 | 40% | ... | 6 | 47% | 53% | 65% | 4% | 31% | Parc des Princes | [{'event_team': 'home', 'event_time': " 15' ",... | [{'player_name': ' Mbappe ', 'player_number': ... | [{'player_name': ' Foden ', 'player_number': '... |
| 4 | Semi-finals first leg | 27.04.2021 | False | False | False | Real Madrid | Chelsea | 1 | 1 | 51% | ... | 5 | 49% | 52% | 61% | 4% | 35% | Alfredo Di Stéfano | [{'event_team': 'away', 'event_time': " 14' ",... | [{'player_name': ' Junior ', 'player_number': ... | [{'player_name': ' Werner ', 'player_number': ... |
5 rows × 24 columns
events = df.events_list.apply(ast.literal_eval)
df.events_list = df.events_list.apply(ast.literal_eval)
events_df = pd.DataFrame.from_records(df.events_list.apply(pd.Series).stack().tolist())
goals_by_minute = events_df.copy()[events_df['event_type'] == 'Goal']#.groupby('event_time').size().reset_index(name='count')
cards_by_minute = events_df.copy()[(events_df['event_type'] == 'Yellow card') | (events_df['event_type'] == 'Red card')]#.groupby('event_time').size().reset_index(name='count')
goals_by_minute['event_type'] = 'Goals'
goals_by_minute['event_time'] = goals_by_minute['event_time'].apply(lambda x: x[:-2]).astype('int')
goals_by_minute = goals_by_minute[goals_by_minute['event_time'] < 90]
cards_by_minute['event_type'] = 'Cards'
cards_by_minute['event_time'] = cards_by_minute['event_time'].apply(lambda x: x[:-2]).astype('int')
cards_by_minute = cards_by_minute[cards_by_minute['event_time'] < 90]
events_by_minute = pd.concat([goals_by_minute, cards_by_minute], ignore_index=True)
europe = gpd.read_file("data/europe.geojson")
df['team_name_home'] = df['team_name_home'].apply(lambda x: x.strip())
df['team_name_away'] = df['team_name_away'].apply(lambda x: x.strip())
group_stage_teams = list(set(df['team_name_home'].tolist()))
round_of_16_teams = list(set(df[(df['stage'] == ' Round of 16 first leg ') | (df['stage'] == ' Round of 16 second leg ')]['team_name_home'].tolist()))
quarter_finals_teams = list(set(df[(df['stage'] == ' Quarter-finals first leg ') | (df['stage'] == ' Quarter-finals second leg ')]['team_name_home'].tolist()))
semi_finals_teams = list(set(df[(df['stage'] == ' Semi-finals first leg ') | (df['stage'] == ' Semi-finals second leg ')]['team_name_home'].tolist()))
final_teams = list(set(df[(df['stage'] == ' Final ')]['team_name_home'].tolist() + df[(df['stage'] == ' Final ')]['team_name_away'].tolist()))
with open("data/teams_countries.json", 'r', encoding='utf-8') as f:
team_countries = json.load(f)
teams_df = pd.DataFrame.from_dict(team_countries, orient='index').reset_index().rename(columns={'index':'teams', 0: 'country'})
country_teams = teams_df.groupby('country',as_index=False)['teams'].aggregate(lambda x: list(x))
country_teams.rename(columns={'country': 'ISO2'}, inplace=True)
country_teams['teams_count'] = country_teams['teams'].str.len()
country_teams_16 = pd.DataFrame(country_teams['teams'].apply(lambda x: [y for y in x if y in round_of_16_teams]))
country_teams_16['ISO2'] = country_teams['ISO2']
country_teams_16['teams_count'] = country_teams_16['teams'].str.len()
country_teams_8 = pd.DataFrame(country_teams['teams'].apply(lambda x: [y for y in x if y in quarter_finals_teams]))
country_teams_8['ISO2'] = country_teams['ISO2']
country_teams_8['teams_count'] = country_teams_8['teams'].str.len()
country_teams_4 = pd.DataFrame(country_teams['teams'].apply(lambda x: [y for y in x if y in semi_finals_teams]))
country_teams_4['ISO2'] = country_teams['ISO2']
country_teams_4['teams_count'] = country_teams_4['teams'].str.len()
country_teams_2 = pd.DataFrame(country_teams['teams'].apply(lambda x: [y for y in x if y in final_teams]))
country_teams_2['ISO2'] = country_teams['ISO2']
country_teams_2['teams_count'] = country_teams_2['teams'].str.len()
country_teams_stages = pd.concat([country_teams, country_teams_16, country_teams_8, country_teams_4, country_teams_2], ignore_index=True)
merged_32 = europe.merge(country_teams, on='ISO2', how='left').fillna("")
merged_32['stage_name'] = 'Group stage'
merged_16 = europe.merge(country_teams_16, on='ISO2', how='left').fillna("")
merged_16['stage_name'] = 'Round of 16'
merged_8 = europe.merge(country_teams_8, on='ISO2', how='left').fillna("")
merged_8['stage_name'] = 'Quarterfinal'
merged_4 = europe.merge(country_teams_4, on='ISO2', how='left').fillna("")
merged_4['stage_name'] = 'Semifinal'
merged_2 = europe.merge(country_teams_2, on='ISO2', how='left').fillna("")
merged_2['stage_name'] = 'Final'
merged = pd.concat([merged_32, merged_16, merged_8, merged_4, merged_2], ignore_index=True)
home_stats = df[['team_name_home', 'team_home_score', 'team_away_score', 'possession_home', 'total_shots_home', 'shots_on_target_home', 'duels_won_home']]
away_stats = df[['team_name_away', 'team_away_score', 'team_home_score', 'possession_away', 'total_shots_away', 'shots_on_target_away', 'duels_won_away']]
home_stats = home_stats.rename(columns={'team_name_home': 'team_name', 'team_home_score': 'scored_goals', 'team_away_score': 'conceded_goals',
'possession_home': 'posession', 'total_shots_home': 'total_shots',
'shots_on_target_home': 'shots_on_target', 'duels_won_home': 'duels_won'})
away_stats = away_stats.rename(columns={'team_name_away': 'team_name', 'team_away_score': 'scored_goals', 'team_home_score': 'conceded_goals',
'possession_away': 'posession', 'total_shots_away': 'total_shots',
'shots_on_target_away': 'shots_on_target', 'duels_won_away': 'duels_won'})
stats = pd.concat([home_stats, away_stats], ignore_index=True)
stats['duels_won'] = stats['duels_won'].str.strip().str.rstrip('%').astype('float')
stats['posession'] = stats['posession'].str.strip().str.rstrip('%').astype('float')
stats['team_name'] = stats['team_name'].str.strip()
stats
| team_name | scored_goals | conceded_goals | posession | total_shots | shots_on_target | duels_won | |
|---|---|---|---|---|---|---|---|
| 0 | Manchester City | 0 | 1 | 61.0 | 7 | 1 | 50.0 |
| 1 | Chelsea | 2 | 0 | 32.0 | 15 | 5 | 52.0 |
| 2 | Manchester City | 2 | 0 | 44.0 | 12 | 5 | 53.0 |
| 3 | PSG | 1 | 2 | 40.0 | 10 | 4 | 47.0 |
| 4 | Real Madrid | 1 | 1 | 51.0 | 9 | 1 | 49.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 245 | Krasnodar | 1 | 1 | 37.0 | 10 | 3 | 51.0 |
| 246 | Ferencváros | 1 | 5 | 38.0 | 8 | 2 | 38.0 |
| 247 | Borussia Dortmund | 1 | 3 | 65.0 | 14 | 7 | 53.0 |
| 248 | Club Brugge | 2 | 1 | 48.0 | 13 | 5 | 55.0 |
| 249 | Juventus | 2 | 0 | 53.0 | 10 | 6 | 59.0 |
250 rows × 7 columns
stats = stats.groupby('team_name').mean().reset_index()
stats['shot_accuracy'] = stats['shots_on_target'] / stats['total_shots'] * 100.0
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaled_stats = stats.copy()
scaled_stats['scored_goals'] = scaler.fit_transform(scaled_stats['scored_goals'].values.reshape(-1, 1)) * 100.0
scaled_stats['conceded_goals'] = scaler.fit_transform(scaled_stats['conceded_goals'].values.reshape(-1, 1)) * 100.0
scaled_stats['total_shots'] = scaler.fit_transform(scaled_stats['total_shots'].values.reshape(-1, 1)) * 100.0
scaled_stats['shots_on_target'] = scaler.fit_transform(scaled_stats['shots_on_target'].values.reshape(-1, 1)) * 100.0
categories = ['scored_goals', 'conceded_goals', 'posession', 'total_shots', 'shots_on_target', 'duels_won', 'shot_accuracy', 'scored_goals']
figs = {
team : go.Scatterpolar(
r=scaled_stats[scaled_stats['team_name'] == team].values.tolist()[0][1:] + [scaled_stats[scaled_stats['team_name'] == team].values.tolist()[0][1]],
theta=categories,
fill='toself',
hoveron="points",
hovertemplate =
'<b>%{text}</b>',
text = [f'{stats[stats["team_name"] == team].scored_goals.values[0]}',
f'{stats[stats["team_name"] == team].conceded_goals.values[0]}',
f'{stats[stats["team_name"] == team].posession.values[0]}',
f'{stats[stats["team_name"] == team].total_shots.values[0]}',
f'{stats[stats["team_name"] == team].shots_on_target.values[0]}',
f'{stats[stats["team_name"] == team].duels_won.values[0]}',
f'{stats[stats["team_name"] == team].shot_accuracy.values[0]}',
f'{stats[stats["team_name"] == team].scored_goals.values[0]}'],
name=team
) for team in scaled_stats.team_name.values.tolist()
}
updatemenus=[
{
"buttons": [
{
"label": team,
"method": "restyle",
"args": [
{
"r": [figs[team].r],
"name": [figs[team].name],
"text": [figs[team].text]
}, [0]
],
}
for team in scaled_stats.team_name.values.tolist()
],
"y": 0.7,
},
{
"buttons": [
{
"label": team,
"method": "restyle",
"args": [
{
"r": [figs[team].r],
"name": [figs[team].name],
"text": [figs[team].text]
}, [1]
],
}
for team in scaled_stats.team_name.values.tolist()[1:] + scaled_stats.team_name.values.tolist()[:1]
],
"x": 1.4,
"y": 0.7,
"xanchor": "right",
},
]
Наступна візуалізація показує розподіл подій(голів та карток) по часовій шкалі футбольного матчу(хвилини). Було вирішено розбити часову шкалу на інтервали по 5 хвилин, через велику кількість стовців у іншому випадку. Отримана візуалізація показує загальну кількість подій, а також допомагає прослідкувати певні тенденції(наприклад зростання кількості карток ближче до кінця матчу).
input_dropdown = alt.binding_select(options=['Goals', 'Cards'], name='Stage: ')
selection = alt.selection_single(fields=['event_type'], bind=input_dropdown, init={'event_type':'Goals'})
alt.Chart(events_by_minute).transform_filter(alt.FieldRangePredicate(field='event_time', range=[1, 90])).mark_bar(
).encode(
x=alt.X("event_time", title='minute', bin=alt.Bin(maxbins=20)),
y=alt.Y("count()", title='count'),
color=alt.ColorValue('#8c6bb1'),
tooltip=[alt.Tooltip('count()', title="Count")]
).add_selection(selection).transform_filter(
selection
).configure_title(
fontSize=20,
color='grey'
).properties( title = alt.TitleParams(
text = 'Match events by minute',
anchor = "middle",
dy=-50,
),
height=500, width=700
)
Наступна візуалізація показує карту з кольоровою шкалою, яка відповідає кількості команд з країни на даному етапі. Етап турніру можна вибирати інтерактивно. Завданням було показати успішність команд з кожної конкретної країни. Розглядав варіант з використанням bar chart, але оскільки ці дані можна представити як географічні, вибрав візуалізацію на мапі.
input_dropdown = alt.binding_select(options=['Group stage', 'Round of 16', 'Quarterfinal', 'Semifinal', 'Final'], name='Stage: ')
selection = alt.selection_single(fields=['stage_name'], bind=input_dropdown, init={'stage_name':'Group stage'})
alt.Chart(merged).project(type='equalEarth').mark_geoshape(stroke='grey').encode(
color = alt.Color('teams_count:Q', scale = alt.Scale(range=['#bcbddc','#9e9ac8','#807dba','#6a51a3','#4a1486']), legend=alt.Legend(title="Number of teams")),
tooltip=[alt.Tooltip('NAME', title="country"), alt.Tooltip('teams_count:Q', title="number of teams"), 'teams:N'],
).add_selection(selection).transform_filter(
selection
).configure_legend(
titleFontSize=14,
titleColor='grey',
labelFontSize=12,
labelFontWeight='bold',
labelColor='grey'
).configure_title(
fontSize=20,
color='grey'
).properties(
title = alt.TitleParams(
text = 'Number of teams from each country',
anchor = "middle",
dy=-50
),
width=700, height=500
).configure_view(strokeWidth=0)
Ця візуалізація дозволяє порівняти між собою середні статистичні дані будь-яких двох команд. Для цього я вибрав radar chart, оскільки він дозволяє одночасно порівняти багато показників, а також зразу показує сильні та слабкі сторони кожної команди, а також показники в яких між командами велика різниця. Недоліком може бути те, що в показників різні числові шкали(одні в відсотках, інші - в звичайний числах, тому довелось змасштабувати всі дані в одну шкалу від 0 до 100(хоч для числових показники в tooltip відображаютсья оригінальне значення)). Проте, на мою думку, візуалізація виконує своє призначення - передає в яких компонентах одна команда краща чи гірша ніж інша.
fig = go.Figure()
fig.add_trace(figs['Ajax'])
fig.add_trace(figs['Atalanta'])
fig.update_layout(
title="Team head to head stats comparison",
title_x=0.5,
hovermode='x',
updatemenus=updatemenus,
polar=dict(
radialaxis=dict(
visible=False,
)),
showlegend=True
)
fig.show()
!jupyter nbconvert --execute --to html main.ipynb
[NbConvertApp] Converting notebook main.ipynb to html [NbConvertApp] Executing notebook with kernel: hw3 [NbConvertApp] Writing 11973203 bytes to main.html